<?php
/* 
遵循GPL2.0开源协议
更新:2020/10/27
作者:黄豆 & 292951110@qq.com
使用方法:UTSW::SplitWord
基于基础词库并结合自定义词库进行分词的系统，也可进行敏感词检测。
训练并添加词组:UTSW::AddWord
www.usualtool.com  QQ群交流:583610949
*/
/* Location of the base dictionary file (relative path). */
define('WORD', 'dict/china.utsw');
/* Location of the custom (trained) dictionary file (relative path). */
define('TRAIN_WORD', 'dict/china.train.utsw');
class UTSW{
    /* Sentinel token that closes the custom dictionary; AddWord inserts new words in front of it. */
    const END_MARKER = '--END--';
    /* Separator placed between text segments by SplitContent and WordInarray. */
    const SEGMENT_SEPARATOR = '|->|';

    /*
    Extract dictionary keywords from a text and, optionally, its title.
    UTSW::SplitWord($content,$title)

    $content  text to analyse (HTML is stripped first)
    $title    optional title; when empty only the full-text keywords are returned

    Returns an array of keyword lists:
      - with a title:    "t_tags" (words found in the title),
                         "n_tags" (words found in the segments of the content
                                   that contain at least one title word),
                         "c_tags" (words found anywhere in the content)
      - without a title: "c_tags" only
    Each list is de-duplicated, indexed from 0 and ordered as in the
    dictionaries (custom dictionary first, then the base dictionary).
    */
    public static function SplitWord($content,$title=''){
        $words = self::ParseWords(self::ReadDict(TRAIN_WORD).",".self::ReadDict(WORD));
        $content = self::DeleteHtml($content);

        $has_title = false;
        if(!empty($title)){
            $title = self::DeleteHtml($title);
            /* A title that is empty once cleaned is treated like no title at all. */
            $has_title = !empty($title);
        }

        $c_tags = self::MatchWords($content, $words);
        if(!$has_title){
            return array("c_tags"=>$c_tags);
        }

        $t_tags = self::MatchWords($title, $words);
        /* Collect, for every title keyword, the content segments mentioning it. */
        $related = array();
        foreach($t_tags as $t_tag){
            $related[] = self::WordInarray($t_tag, $content);
        }
        $n_content = implode(self::SEGMENT_SEPARATOR, array_unique($related));
        $n_tags = self::MatchWords($n_content, $words);

        return array("t_tags"=>$t_tags,"n_tags"=>$n_tags,"c_tags"=>$c_tags);
    }

    /*
    Keep only the segments of a text that contain a keyword.
    WordInarray($keyword,$content)

    The content is cut with SplitContent; the segments containing $keyword
    are joined back with "|->|" and returned as one string. An empty keyword
    matches nothing and yields an empty string.
    */
    public static function WordInarray($keyword,$content){
        $keyword = (string)$keyword;
        if($keyword === ''){
            /* strpos() with an empty needle warns on PHP < 8 and matches everything on PHP 8. */
            return '';
        }
        $matched = array();
        foreach(self::SplitContent($content) as $segment){
            if(strpos($segment, $keyword) !== false){
                $matched[] = $segment;
            }
        }
        return implode(self::SEGMENT_SEPARATOR, $matched);
    }

    /*
    Split a text into segments at Chinese and ASCII punctuation.
    SplitContent($content)

    Returns the list of segments; segments may be empty strings when two
    punctuation marks are adjacent or the text ends with one.
    */
    public static function SplitContent($content){
        $symbol = array("，","。","？","！","……",",",".","!","?");
        $marked = str_replace($symbol, self::SEGMENT_SEPARATOR, $content);
        return explode(self::SEGMENT_SEPARATOR, $marked);
    }

    /*
    Strip HTML tags and whitespace from a text.
    DeleteHtml($content)

    Removes tags, line breaks, full-width spaces, "&nbsp;" entities and
    spaces, then trims the remaining edge whitespace.
    */
    public static function DeleteHtml($content){
        /* Cast first: strip_tags(null) is deprecated on recent PHP versions. */
        $content = strip_tags((string)$content);
        /* The searches are applied in order, so "&nbsp;" is removed before plain spaces. */
        $content = str_replace(array("\r\n", "\r", "\n", "　", "&nbsp;", " "), "", $content);
        return trim($content);
    }

    /*
    Add a word to the custom dictionary.
    AddWord($keyword)

    Returns 1 when the word was written, 0 when it was not: the word is
    empty, is the end marker itself, already exists as a whole word in
    either dictionary, or the dictionary file could not be written.
    A keyword containing a comma is stored as-is and is therefore read
    back as several words.
    */
    public static function AddWord($keyword){
        $keyword = trim((string)$keyword);
        if($keyword === '' || $keyword === self::END_MARKER){
            return 0;
        }

        $train = self::ReadDict(TRAIN_WORD);
        $known = self::ParseWords($train.",".self::ReadDict(WORD));
        /* Whole-word comparison: a substring search would reject "AB" just because "ABC" exists. */
        if(in_array($keyword, $known, true)){
            return 0;
        }

        $entry = $keyword.",".self::END_MARKER;
        $pos = strrpos($train, self::END_MARKER);
        if($pos !== false){
            /* Insert right before the closing marker. */
            $train = substr_replace($train, $entry, $pos, strlen(self::END_MARKER));
        }else{
            /* No marker yet (new or hand-edited file): append the word and close the list. */
            $head = rtrim($train, ", \t\r\n");
            $train = ($head === '' ? '' : $head.",").$entry;
        }

        return file_put_contents(TRAIN_WORD, $train) === false ? 0 : 1;
    }

    /* Read a dictionary file; an unreadable file counts as an empty dictionary. */
    private static function ReadDict($path){
        $raw = file_get_contents($path);
        return $raw === false ? '' : $raw;
    }

    /* Turn comma-separated dictionary text into a clean list: trimmed, unique, no empty tokens, no end marker. */
    private static function ParseWords($raw){
        $words = array();
        foreach(explode(',', $raw) as $token){
            $token = trim($token);
            if($token !== '' && $token !== self::END_MARKER){
                $words[] = $token;
            }
        }
        return array_values(array_unique($words));
    }

    /* Return the words of $words that occur in $text, in dictionary order. */
    private static function MatchWords($text, $words){
        $found = array();
        foreach($words as $word){
            if(strpos($text, $word) !== false){
                $found[] = $word;
            }
        }
        return $found;
    }
}